Imports & Dataset


In [9]:
import math
import numpy

In [10]:
# Toy dataset in column-major (one-list-per-feature) layout; each inner list
# holds one feature for the same 7 people, aligned by position:
#   dataset[0] -> name, dataset[1] -> gender,
#   dataset[2] -> age range, dataset[3] -> height range.
dataset = [['dugg', 'clare', 'will', 'donald', 'deril', 'gregory', 'julia'],
           ['M', 'F', 'M', 'M', 'M', 'M', 'F'],
           ['20-30', '20-30', '20-30', '20-30', '30-40', '20-30', '5-10'],
           ['1.60-1.70', '1.70-1.80', '1.70-1.80', '1.80-1.90', '1.70-1.80', '>1.90', '<1.60']]

In [11]:
labels = ['no', 'no', 'yes', 'yes', 'no', 'no', 'yes']

Helper Functions - Decision Trees


In [12]:
def calc_entropy(labels):
    """Shannon entropy (natural-log base) of a sequence of class labels.

    Args:
        labels: sequence of hashable class labels.

    Returns:
        float: sum over distinct labels of p * ln(1/p), where p is the
        label's relative frequency. Returns 0.0 for an empty sequence.
    """
    n = float(len(labels))
    # Relative frequency of each distinct label.
    probs = [labels.count(lbl) / n for lbl in set(labels)]
    # p * log(1/p) == -p * log(p); start value 0.0 keeps the empty case a float.
    return sum((p * math.log(1.0 / p) for p in probs), 0.0)

In [13]:
def calc_split_entropy(splitted_labels):
    """Weighted average entropy of a partition of labels (post-split entropy).

    Args:
        splitted_labels: iterable of label groups (e.g. the .values() of
            the dict produced by `split_by`).

    Returns:
        float: sum over groups of (group size / total size) * calc_entropy(group).
    """
    groups = list(splitted_labels)
    total = sum(len(group) for group in groups)
    weighted = 0.0
    for group in groups:
        weight = len(group) / float(total)
        weighted += weight * calc_entropy(group)
    return weighted

In [14]:
def split_by(feature_index, dataset, labels):
    """Group labels by the value of one feature.

    Args:
        feature_index: index of the feature row in `dataset` to split on.
        dataset: column-major dataset (list of feature lists, all the same length).
        labels: class label for each sample, aligned with the feature lists.

    Returns:
        dict mapping each distinct feature value to the list of labels of
        the samples carrying that value, in original order.
    """
    splitted_labels = {}
    for i, v in enumerate(dataset[feature_index]):
        # dict.has_key() was removed in Python 3; setdefault covers both
        # the first-seen and already-seen cases in one call.
        splitted_labels.setdefault(v, []).append(labels[i])
    return splitted_labels

In [15]:
def calc_variance(values):
    """Fraction of distinct values in the sequence.

    NOTE(review): despite the name, this is a diversity ratio
    (distinct count / total count), not a statistical variance.

    Args:
        values: non-empty sequence of hashable values.

    Returns:
        float in (0, 1]: len(set(values)) / len(values).
    """
    distinct = set(values)
    return len(distinct) / float(len(values))

Example Calculations — entropy before and after splitting on the height feature


In [25]:
calc_split_entropy(split_by(3, dataset, labels).values())


Out[25]:
0.2727917864120626

In [17]:
calc_entropy(labels)


Out[17]:
0.6829081047004716

In [24]:
calc_variance(dataset[3])


Out[24]:
0.7142857142857143

In [ ]: